1 Core genome MLST schema and quality control

The E. coli scheme from Enterobase [1] was used with chewBBACA [2] to get the cgMLST profile for each isolate. The number of missing loci in total (sum of LNF, NIPH, PLOT, ALM and ASM from chewBBACA) is plotted below for all isolates.

# Vector of excluded samples
ex_samples <- c(
  "2016-17-565-1-S",
  "2016-02-428-2-S",
  "2016-02-486-2-S",
  "2016-02-732-2-S",
  "2014-01-1741-1-S"
  )

# Collapses data frame to unique lines while ignoring NA's
func_paste <- function(x) paste(unique(x[!is.na(x)]), collapse = ", ")

"%not_in%" <- Negate("%in%")


# Isolate data
id_data <- read.table("../data/id.txt",
                      sep = "\t",
                      header = TRUE,
                      stringsAsFactors = F)

isolate_data <- read.table("../data/isolate_data.txt",
                           sep = "\t",
                           header = T,
                           stringsAsFactors = F) %>%
  filter(id %not_in% ex_samples)


# cgMLST data
cgmlst_stats <- read.table("../data/chewbbaca/complete_results/mdata_stats.tsv",
                         sep = "\t",
                         header = TRUE,
                         stringsAsFactors = FALSE)

detailed_cgmlst_stats <- read.table("../data/chewbbaca/results_statistics.tsv",
                                    sep = "\t",
                                    header = TRUE,
                                    stringsAsFactors = FALSE)

cgMLST_allele_data <- read.table("../data/chewbbaca/complete_results/cgMLST.tsv",
                              sep = "\t",
                              header = TRUE,
                              stringsAsFactors = FALSE) %>%
  mutate(FILE = sub("_pilon_spades.fasta", "", FILE)) %>%
  dplyr::rename("ref" = FILE) %>%
  left_join(id_data, by = "ref") %>%
  select(id, everything(), -ref) %>%
  filter(id %not_in% ex_samples)
boxplot(cgmlst_stats$number.of.missing.data,
        ylab = "Number of missing loci")

Below follows the detailed results for all loci in the cgMLST scheme. EXC = excact matches, INF = novel allele, LNF = locus not found, NIPH = paralogous locus, PLOT = possible loci on tip of contig, ALM = allele 20 % larger than reference, ASM = allele 20 % smaller than reference. The numbers below each boxplot denote the mean.

test <- detailed_cgmlst_stats %>%
  gather(metric, value, -Genome) %>%
  group_by(metric) %>%
  mutate(mean_val = round(mean(value), 1))

ggplot(test, aes(metric, value)) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot() +
  geom_text(data = subset(test,
                          Genome == "2006-01-2184_S30_pilon_spades.fasta"),
            aes(label = mean_val),
            y = -60,
            size = 3.8) +
  scale_x_discrete(limits = c("EXC",
                              "INF",
                              "LNF",
                              "NIPH",
                              "PLOT",
                              "ALM",
                              "ASM")) +
  labs(y = "Number of loci") +
  theme(axis.title.x = element_blank(),
        axis.ticks.x = element_blank())

2 Total dendrogram of all isolates

Genetic distances were calculated from the allele data from the cgMLST report, and a dendrogram was drawn from these distances. The annotations are based on data from the ARIBA resistance gene analysis.

cgMLST_data <- read.table("../data/chewbbaca/complete_results/clean_cgMLST_data.txt",
                   sep = "\t",
                   header = TRUE,
                   colClasses = "factor")

total_tree <- as.phylo(hclust(daisy(cgMLST_data, metric = "gower"), method = "average"))


tree_metadata <- read.table("../data/chewbbaca/complete_results/tree_metadata.txt",
                            sep = "\t",
                            header = TRUE)

tree_heatmap_data <- read.table("../data/chewbbaca/complete_results/tree_heatmap_data.txt",
                            sep = "\t",
                            header = TRUE) %>%
  select(-gadX)

palette <- c("Broiler" = "#4575b4",
             "Pig" = "#74add1",
             "Red fox" = "#f46d43",
             "Wild bird" = "#fdae61")

palette2 <- c("1-A" = "#fc8d59",
              "1-I" = "#80b1d3",
              "0" = "grey95")

total_tree_annotated <- ggtree(total_tree,
                               layout = "circular",
                               color = "#808080",
                               size = 0.5) %<+% tree_metadata +
                               geom_tippoint(aes(color = species),
                               size = 3) +
                               geom_tiplab2(aes(label = ST),
                               offset = 0.01,
                               size = 4) +
  scale_color_manual(values = palette)

total_tree_opened <- rotate_tree(open_tree(total_tree_annotated, 8), 93)

gheatmap(total_tree_opened,
         tree_heatmap_data,
         offset = 0.04,
         width = 0.3,
         colnames_offset_y = 2.5,
         colnames_position = "top",
         font.size = 5) +
  scale_fill_manual(values = palette2,
                    labels = c("Gene/Mutation absent",
                               "Acquired gene present",
                               "Intrinsic gene mutated")) +
  theme(legend.position = c(0.9,0.9),
        legend.text = element_text(size = 20))

3 Dendrograms per animal species

metadata_species <- read.table("../data/chewbbaca/complete_results/metadata_species_specific_trees.txt",
                               sep = "\t",
                               header = TRUE)

palette3 <- c("0" = "grey95",
              # gyrA
              "S83L" = "#c6dbef",
              "S83A" = "#9ecae1",
              "D87N" = "#6baed6",
              "D87H" = "#4292c6",
              "D87Y" = "#2171b5",
              "D87G" = "#08519c",
              "S83L, D87N" = "#08306b",
              # parC
              "S80I" = "#a1d99b",
              "S80R" = "#74c476",
              "S57T" = "#41ab5d",
              "A56T, S80I" = "#238b45",
              "S80I, E84V" = "#005a32",
              # parE
              "D475E" = "#fdd0a2",
              "D463N" = "#fdae6b",
              "S458A" = "#fd8d3c",
              "L416F" = "#f16913",
              "H516Y" = "#d94801",
              "L488M, A512T" = "#8c2d04",
              # PMQR
              "qnrA1" = "#762a83",
              "qnrB19" = "#ef8a62",
              "qnrS1" = "#a6dba0",
              "qnrS2" = "#5aae61",
              "qnrS4" = "#1b7837",
              "qepA4" = "#80cdc1",
              # num
              "1" = "#440154FF",
              "2" = "#3B528BFF",
              "3" = "#21908CFF",
              "4" = "#5DC863FF",
              "5" = "#FDE725FF")

# Broiler tree
broiler_cgMLST <- read.table("../data/chewbbaca/complete_results/broiler_cgmlst.txt",
                   sep = "\t",
                   header = TRUE,
                   colClasses = "factor")

broiler_tree <- as.phylo(hclust(daisy(broiler_cgMLST, metric = "gower"), method = "average"))

broiler_tree_annotated <- ggtree(broiler_tree,
                                 layout = "circular",
                                 color = "#808080",
                                 size = 0.5) %<+% tree_metadata +
  geom_tippoint(color = "#4575b4",
                size = 4) +
  geom_tiplab2(aes(label = ST),
               offset = 0.01,
               size = 6)

broiler_tree_opened <- rotate_tree(open_tree(broiler_tree_annotated, 10), 92.5)

broiler_complete <- gheatmap(broiler_tree_opened,
         metadata_species,
         offset = 0.1,
         width = 0.5,
         colnames_offset_y = 0.75,
         colnames_position = "top",
         font.size = 5) +
  scale_fill_manual(values = palette3) +
  guides(fill = FALSE)

# Pig tree
pig_cgMLST <- read.table("../data/chewbbaca/complete_results/pig_cgmlst.txt",
                   sep = "\t",
                   header = TRUE,
                   colClasses = "factor")

pig_tree <- as.phylo(hclust(daisy(pig_cgMLST, metric = "gower"), method = "average"))

pig_tree_annotated <- ggtree(pig_tree,
                                 layout = "circular",
                                 color = "#808080",
                                 size = 0.5) %<+% tree_metadata +
  geom_tippoint(color = "#74add1",
                size = 4) +
  geom_tiplab2(aes(label = ST),
               offset = 0.01,
               size = 5.5)

pig_tree_opened <- rotate_tree(open_tree(pig_tree_annotated, 10), 92.5)

pig_complete <- gheatmap(pig_tree_opened,
         metadata_species,
         offset = 0.09,
         width = 0.5,
         colnames_offset_y = 0.53,
         colnames_position = "top",
         font.size = 5) +
  scale_fill_manual(values = palette3) +
  guides(fill = FALSE)

# Red fox tree
fox_cgMLST <- read.table("../data/chewbbaca/complete_results/fox_cgmlst.txt",
                   sep = "\t",
                   header = TRUE,
                   colClasses = "factor")

fox_tree <- as.phylo(hclust(daisy(fox_cgMLST, metric = "gower"), method = "average"))

fox_tree_annotated <- ggtree(fox_tree,
                                 layout = "circular",
                                 color = "#808080",
                                 size = 0.5) %<+% tree_metadata +
  geom_tippoint(color = "#f46d43",
                size = 4) +
  geom_tiplab2(aes(label = ST),
               offset = 0.01,
               size = 5.5)

fox_tree_opened <- rotate_tree(open_tree(fox_tree_annotated, 10), 92)

fox_complete <- gheatmap(fox_tree_opened,
         metadata_species,
         offset = 0.09,
         width = 0.5,
         colnames_offset_y = 0.17,
         colnames_position = "top",
         font.size = 5) +
  scale_fill_manual(values = palette3) +
  guides(fill = FALSE)

# Wild bird tree
bird_cgMLST <- read.table("../data/chewbbaca/complete_results/bird_cgmlst.txt",
                   sep = "\t",
                   header = TRUE,
                   colClasses = "factor")

bird_tree <- as.phylo(hclust(daisy(bird_cgMLST, metric = "gower"), method = "average"))

bird_tree_annotated <- ggtree(bird_tree,
                                 layout = "circular",
                                 color = "#808080",
                                 size = 0.5) %<+% tree_metadata +
  geom_tippoint(color = "#fdae61",
                size = 4) +
  geom_tiplab2(aes(label = ST),
               offset = 0.01,
               size = 5.5)

bird_tree_opened <- rotate_tree(open_tree(bird_tree_annotated, 10), 92.5)

bird_complete <- gheatmap(bird_tree_opened,
         metadata_species,
         offset = 0.09,
         width = 0.5,
         colnames_offset_y = 0.38,
         colnames_position = "top",
         font.size = 5) +
  scale_fill_manual(values = palette3) +
  guides(fill = FALSE)

# Total plot
tot_plot <- plot_grid(broiler_complete,
          pig_complete,
          fox_complete,
          bird_complete,
          nrow = 2,
          ncol = 2,
          align = "h")

include_graphics("../figures/species_dendrogram.png")

4 Statistics

4.1 Distribution of calculated distances per animal species

Below follows a plot of the distribution of the lower triangular distances for each animal species. A bimodal distribution is present in all four plots. The filtering values of < 0.1 and > 0.8 is used for downstream statistics to acertain whether the broiler isolates are more homogenous than the isolates from other species.

# calculates distances from cgMLST data and returns
# the lower triangular of the matrix, with (diag = FALSE)
# or without (diag = TRUE) the diagonal
lower_tri_dist_calc <- function(data, diag = TRUE, data_frame = FALSE) {
  dist <- as.matrix(daisy(data, metric = "gower"))
  if (diag == TRUE) {
    dist[upper.tri(dist, diag = TRUE)] <- NA
  } else {
    dist[upper.tri(dist, diag = FALSE)] <- NA
  }
  dist_vec <- as.vector(as.matrix(dist))
  dist_compl <- dist_vec[!is.na(dist_vec)]
  
  if (data_frame == TRUE) {
    dist_compl <- as.data.frame(as.matrix(dist))
  }
  return(dist_compl)
}

broiler_dist <- lower_tri_dist_calc(broiler_cgMLST)
pig_dist <- lower_tri_dist_calc(pig_cgMLST)
fox_dist <- lower_tri_dist_calc(fox_cgMLST)
bird_dist <- lower_tri_dist_calc(bird_cgMLST)


# Distance plots
par(mfrow = c(2,2))

x1 <- hist(broiler_dist,
     breaks = 20,
     plot = FALSE)

x1$density = x1$counts/sum(x1$counts)*100

plot(x1,
     freq = FALSE,
     main = "Broiler distance frequency",
     xlab = "Distance",
     ylim = c(0, 50))


x2 <- hist(pig_dist,
     breaks = 20,
     plot = FALSE)

x2$density = x2$counts/sum(x2$counts)*100

plot(x2,
     freq = FALSE,
     main = "Pig distance frequency",
     xlab = "Distance",
     ylim = c(0, 50))

x3 <- hist(fox_dist,
     breaks = 20,
     plot = FALSE)

x3$density = x3$counts/sum(x3$counts)*100

plot(x3,
     freq = FALSE,
     main = "Red fox distance frequency",
     xlab = "Distance",
     ylim = c(0, 50))


x4 <- hist(bird_dist,
     breaks = 20,
     plot = FALSE)

x4$density = x4$counts/sum(x4$counts)*100

plot(x4,
     freq = FALSE,
     main = "Wild bird distance frequency",
     xlab = "Distance",
     ylim = c(0, 50))

4.2 Distribution of iterated distances

Distribution of percentages of distances < 0.1 or > 0.8. The distribution is based on 1000 iterations of the distance data. Observed values are denoted as arrows for each animal species; dark blue = Broiler, light blue = Pig, red = Red fox, yellow = Wild bird. The mean value is denoted as a vertical black line.

# generate data frame from allele data
alleles <- cgMLST_allele_data %>%
  left_join(isolate_data[,c("id","species")], by = "id") %>%
  group_by(species) %>%
  mutate(n = 1:n(),
         new_id = case_when(species == "Broiler" ~ paste("BR", n, sep = "_"),
                            species == "Pig" ~ paste("PI", n, sep = "_"),
                            species == "Red fox" ~ paste("RF", n, sep = "_"),
                            species == "Wild bird" ~ paste("WB", n, sep = "_"))) %>%
  ungroup() %>%
  select(new_id, everything(), -c(species, id)) %>%
  mutate_all(funs(as.factor)) %>%
  column_to_rownames("new_id")

# generate data frame for iteration analysis
run_df <- lower_tri_dist_calc(alleles, data_frame = TRUE) %>%
  rownames_to_column("id") %>%
  gather(isolate1, value, -id) %>%
  dplyr::rename("isolate2" = id) %>%
  mutate(group1 = substr(isolate1, start = 1, stop = 2),
         group2 = substr(isolate2, start = 1, stop = 2)) %>%
  select(isolate1, group1, isolate2, group2, value) %>%
  filter(is.na(value) == FALSE,
         group1 == group2) %>%
  arrange(group1, group2)

# functions used in iteration analyses

## Calculates the sum of observations lower than dist_value1 and
## bigger than dist_value2, and calculates the percentage of
## observed values for each group
without <- function(df, run, dist_value1 = 0.1, dist_value2 = 0.8) {
  df <- df %>%
    group_by(group1, group2) %>%
    mutate(n = value < dist_value1 | value > dist_value2) %>%
    group_by(group1, group2, n) %>%
    count() %>%
    ungroup() %>%
    mutate(n = ifelse(n == TRUE, "Within", "Without")) %>%
    spread(n, nn) %>%
    mutate(Percent = round(Within / (Within + Without) * 100, 2),
           iter = run)
  return(df)
}

## Calculates the sum of observations between dist_value1 and dist_value2
## and calculates the percentage of observed values for each group
within <- function(df, run, dist_value1 = 0.1, dist_value2 = 0.8) {
  df <- df %>%
    group_by(group1, group2) %>%
    mutate(n = value > dist_value1 & value < dist_value2) %>%
    group_by(group1, group2, n) %>%
    count() %>%
    ungroup() %>%
    mutate(n = ifelse(n == TRUE, "Within", "Without")) %>%
    spread(n, nn) %>%
    mutate(Percent = round(Within / (Within + Without) * 100, 2),
           iter = run)
  return(df)
}

## Calculates the sum of observations below dist_value1
## for each group
below_calc <- function(df, run, dist_value1 = 0.1) {
  df <- df %>%
    group_by(group1, group2) %>%
    mutate(n = value < dist_value1) %>%
    group_by(group1, group2, n) %>%
    count() %>%
    ungroup() %>%
    mutate(n = ifelse(n == TRUE, "Below", "Above")) %>%
    spread(n, nn) %>%
    mutate(Below = ifelse(is.na(Below) == TRUE, 0, Below)) %>%
    mutate(Percent_below = round(Below / (Above + Below) * 100, 2),
           iter = run)
  return(df)
}

## Calculates the sum of observations above dist_value1
## for each group
above_calc <- function(df, run, dist_value1 = 0.9) {
  df <- df %>%
    group_by(group1, group2) %>%
    mutate(n = value > dist_value1) %>%
    group_by(group1, group2, n) %>%
    count() %>%
    ungroup() %>%
    mutate(n = ifelse(n == TRUE, "Above", "Below")) %>%
    spread(n, nn) %>%
    mutate(Above = ifelse(is.na(Above) == TRUE, 0, Above)) %>%
    mutate(Percent_above = round(Above / (Above + Below) * 100, 2),
           iter = run)
  return(df)
}

## Calculates the mean of observations for each group
mean_calc <- function(df, run) {
  df <- df %>%
    group_by(group1, group2) %>%
    mutate(metric = round(mean(value), 2)) %>%
    select(group1, group2, metric) %>%
    summarise_all(funs(func_paste)) %>%
    mutate(iter = run,
           metric = as.numeric(metric))
}

## Calculates the median of observations for each group
median_calc <- function(df, run) {
  df <- df %>%
    group_by(group1, group2) %>%
    mutate(metric = round(median(value), 2)) %>%
    select(group1, group2, metric) %>%
    summarise_all(funs(func_paste)) %>%
    mutate(iter = run,
           metric = as.numeric(metric))
}

## Calculates the quantiles of the observations for each group
quantile_calc <- function(df, run) {
  df <- df %>%
    group_by(group1, group2) %>%
    mutate(quant_0 = quantile(value)[1],
           quant_25 = quantile(value)[2],
           quant_50 = quantile(value)[3],
           quant_75 = quantile(value)[4],
           quant_100 = quantile(value)[5]) %>%
    select(group1, group2, contains("quant")) %>%
    summarise_all(funs(func_paste)) %>%
    mutate_at(vars(contains("quant")),
              funs(as.numeric)) %>%
    mutate(iter = run)
}

# Iteration function
iterate_samples <- function(df, run, method = "mean", dist_value1, dist_value2) {

  # randomize the value column, regardless of groups
  df <- df %>%
    mutate(value = sample(value))

  # run specified function
  if (method == "mean") {
    df <- mean_calc(df, run)
    }

  if (method == "below") {
   df <- below_calc(df, run, dist_value1)
    }

  if (method == "above") {
   df <- above_calc(df, run, dist_value1)
    }

  if (method == "within") {
    df <- within(df, run, dist_value1, dist_value2)
    }

  if (method == "without") {
    df <- without(df, run, dist_value1, dist_value2)
    }
  
  if (method == "median") {
    df <- median_calc(df, run)
    }
  
  if (method == "quantile") {
    df <- quantile_calc(df, run)
  }
    
    return(df)
}

# run iterations on selected data frame
run_iterations <- function(df, runs = 1000, seed = 10, method = "mean", dist_value1, dist_value2) {
  # set seed
  set.seed(seed)
  
  # create output list
  output <- list()
  
  # create expected values
  if (method == "mean") {
    orig <- mean_calc(df, 0)
    }

  if (method == "below") {
   orig <- below_calc(df, 0, dist_value1)
    }

  if (method == "above") {
   orig <- above_calc(df, 0, dist_value1)
    }
  
  if (method == "within") {
    orig <- within(df, 0, dist_value1, dist_value2)
    }

  if (method == "without") {
    orig <- without(df, 0, dist_value1, dist_value2)
    }

  if (method == "median") {
    orig <- median_calc(df, 0)
    }

  if (method == "quantile") {
    orig <- quantile_calc(df, 0)
  }

  output <- c(output, list(orig))
  
  # run iterations
  for (i in 1:runs) {
    result <- iterate_samples(df,
                              i,
                              method = method,
                              dist_value1 = dist_value1,
                              dist_value2 = dist_value2)
    output <- c(output, list(result))
    }
  return(output)
}

# Run functions
summary_stats <- run_df %>%
  mutate(total_mean = round(mean(value)*100, 3)) %>%
  group_by(group1) %>%
  mutate(mean = round(mean(value)*100, 3),
         var = round(var(value)*100, 3)) %>%
  select(group1, total_mean, mean, var) %>%
  summarise_all(funs(func_paste))

summary_stats2 <- run_df %>%
  mutate(total_mean = round(mean(value), 3)) %>%
  group_by(group1) %>%
  mutate(mean = round(mean(value), 3),
         var = round(var(value), 3)) %>%
  select(group1, total_mean, mean, var) %>%
  summarise_all(funs(func_paste))

complete_data <- run_iterations(run_df,
                                runs = 1000,
                                method = "without",
                                dist_value1 = 0.1,
                                dist_value2 = 0.8) %>%
  bind_rows() %>%
  mutate(Percent = as.numeric(Percent))

lower_data <- run_iterations(run_df,
                             runs = 1000,
                             method = "below",
                             dist_value1 = 0.1) %>%
  bind_rows() %>%
  mutate(Percent_below = as.numeric(Percent_below))

# Plots

ggplot(complete_data, aes(Percent)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.2, color = "black",
                fill = "grey80") +
  stat_function(fun = dnorm,
                args = with(complete_data, c(mean = mean(Percent), sd = sd(Percent))),
                color = "red") +
  geom_segment(aes(x = complete_data$Percent[1],
                   xend = complete_data$Percent[1], y = 0.08, yend = 0),
               arrow = arrow(length = unit(0.3, "cm"),
                             type = "closed"),
               color = "#4575b4") +
  geom_segment(aes(x = complete_data$Percent[2],
                   xend = complete_data$Percent[2], y = 0.08, yend = 0),
               arrow = arrow(length = unit(0.3, "cm"),
                             type = "closed"),
               color = "#74add1") +
  geom_segment(aes(x = complete_data$Percent[3],
                   xend = complete_data$Percent[3], y = 0.08, yend = 0),
               arrow = arrow(length = unit(0.3, "cm"),
                             type = "closed"),
               color = "#f46d43") +
  geom_segment(aes(x = complete_data$Percent[4],
                   xend = complete_data$Percent[4], y = 0.08, yend = 0),
               arrow = arrow(length = unit(0.3, "cm"),
                             type = "closed"),
               color = "#fdae61") +
  geom_text(label = "p < 0.001",
            x = 83.30,
            y = 0.16,
            angle = 90) +
  geom_vline(xintercept = as.numeric(summary_stats$mean[1]),
             size = 1,
             color = "#4575b4") +
  geom_vline(xintercept = as.numeric(summary_stats$mean[2]),
             size = 1,
             color = "#74add1") +
  geom_vline(xintercept = as.numeric(summary_stats$mean[3]),
             size = 1,
             color = "#f46d43") +
  geom_vline(xintercept = as.numeric(summary_stats$mean[4]),
             size = 1,
             color = "#fdae61") +
  labs(x = "Percent (%) distances < 0.1 or > 0.8",
       y = "Density") +
  ggtitle(label = "Distribution of percentages",
          subtitle = "Number of iterations: 1000") +
  theme(plot.title = element_text(hjust = 0))

ggplot(lower_data, aes(Percent_below)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.2, color = "black",
                fill = "grey80") +
  stat_function(fun = dnorm,
                args = with(lower_data, c(mean = mean(Percent_below), sd = sd(Percent_below))),
                color = "red") +
  geom_segment(aes(x = lower_data$Percent_below[1],
                   xend = lower_data$Percent_below[1], y = 0.15, yend = 0),
               arrow = arrow(length = unit(0.3, "cm"),
                             type = "closed"),
               color = "#4575b4") +
  geom_segment(aes(x = lower_data$Percent_below[2],
                   xend = lower_data$Percent_below[2], y = 0.15, yend = 0),
               arrow = arrow(length = unit(0.3, "cm"),
                             type = "closed"),
               color = "#74add1") +
  geom_segment(aes(x = lower_data$Percent_below[3],
                   xend = lower_data$Percent_below[3], y = 0.15, yend = 0),
               arrow = arrow(length = unit(0.3, "cm"),
                             type = "closed"),
               color = "#f46d43") +
  geom_segment(aes(x = lower_data$Percent_below[4],
                   xend = lower_data$Percent_below[4], y = 0.15, yend = 0),
               arrow = arrow(length = unit(0.3, "cm"),
                             type = "closed"),
               color = "#fdae61") +
  geom_text(label = "p < 0.001",
            x = 5.87,
            y = 0.32,
            angle = 90) +
  geom_vline(xintercept = mean(lower_data$Percent_below),
             size = 1) +
  labs(x = "Percent (%) distances < 0.1",
       y = "Density") +
  ggtitle(label = "Distribution of Percentages",
          subtitle = "Number of iterations: 1000") +
  theme(plot.title = element_text(hjust = 0))

summary_stats2

5 References

[1] Alikhan N-F, Zhou Z, Sergeant MJ, Achtman M. A genomic overview of the population structure of Salmonella. Casadesús J, editor. PLOS Genet [Internet]. 2018 Apr 5;14(4):e1007261. Available from: https://dx.plos.org/10.1371/journal.pgen.1007261

[2] Machado MP, Silva M, Carriço JA, Silva DN, Santos S, Ramirez M, et al. chewBBACA: A complete suite for gene-by-gene schema creation and strain identification. Microb Genomics. 2018;1–7.